In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold

# preprocessing
from sklearn.preprocessing import StandardScaler


# keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils.vis_utils import plot_model
import keras.backend as K

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("white")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")
Using TensorFlow backend.

Anomaly Detection

Anomaly detection is a classification process in which rare items, events, or observations in data sets are identified. Learn more about this here. In this article, we investigate Credit Card Fraud Detection dataset from Kaggle.com.

Credit Card Fraud Detection

Context

Credit card companies must be able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

Content

The datasets contain transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [2]:
Data = pd.read_csv('Data/creditcard.csv',sep=',')

Col = []
#         Temp = re.findall("(\d+)", s)
for s in Data.columns:
    if any(map(str.isdigit, s)) == True:
        Temp = s.split('V')
        Col.append('V'+ Temp[-1].zfill(2))
    else:
        Col.append(s)

Data.columns = Col
del Col

display(pd.DataFrame(Data.shape, columns = ['Count'], index = ['Attributes', 'Instances']).T)
Attributes Instances
Count 284807 31

Initial Analysis

From the above dataset, we can visualize the following features.

  • Amount
  • Class
  • Time

Transaction Class Distribution

In [3]:
Labels = ['Normal', 'Fraud']
Temp = Data['Class'].value_counts(sort = False).to_frame('Count').reset_index()
Temp.columns = ['Class','Count']
Temp['Class'] = Temp['Class'].map(lambda x: Labels[0] if x == 0 else Labels[1])
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)
display(Temp.style.hide_index().set_precision(2))

fig = px.bar(Temp, y= 'Class', x= 'Percentage', orientation='h', text = 'Count', color_discrete_sequence= ['Bisque'],
             height= 220)
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Transaction Class Distribution', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Class Count Percentage
Normal 284315 99.83
Fraud 492 0.17

As can be seen, nearly, 99.83 percent of the dataset are labeled as Normal.

In [4]:
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.hist(Data.loc[Data.Class == 0, 'Amount'], 100, color = '#34495e', hatch = '/', lw = 1.5,
            edgecolor = '#3498db', label = Labels[0])
_ = ax.hist(Data.loc[Data.Class == 1, 'Amount'], 10, Color = '#e74c3c', hatch = '\\', lw = 1.5,
            edgecolor = 'DarkRed', label = Labels[1])
_ = ax.set_xlabel('Amount')
_ = ax.set_ylabel('Frequency (Logarithm Scale)')
_ = ax.set_xlim([0, 2e4])
_ = ax.set_yscale('log')
_ = ax.set_ylim([0, 1e6])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)

Time vs Amount of Transactions

In [5]:
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.scatter(Data.loc[Data.Class == 0, 'Time'], Data.loc[Data.Class == 0, 'Amount'], s= 30,
               facecolors='SkyBlue', edgecolors='MidnightBlue', alpha = 0.8, label = Labels[0])
_ = ax.scatter(Data.loc[Data.Class == 1, 'Time'], Data.loc[Data.Class == 1, 'Amount'], s= 30,
               facecolors='Orange', edgecolors='DarkRed', alpha = 1, label = Labels[1])
_ = ax.set_xlabel('Time (in seconds)')
_ = ax.set_ylabel('Amount')
_ = ax.set_xlim([-500, Data.Time.max()+500])
_ = ax.set_ylim([-250, 2e4])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)

Modeling

The Dataset is quite large, we would like to use pandas DataFrame sample feature with using a one-tenth of the data as a sample.

In [6]:
df= Data.sample(frac = 0.1, random_state=1)

def Data_info(Inp, Only_NaN = False):
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
    Out['Size'] = Inp.shape[0]
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out
display(df.head())
Data_info(df)
Time V01 V02 V03 V04 V05 V06 V07 V08 V09 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
169876 119907.0 -0.611712 -0.769705 -0.149759 -0.224877 2.028577 -2.019887 0.292491 -0.523020 0.358468 ... -0.075208 0.045536 0.380739 0.023440 -2.220686 -0.201146 0.066501 0.221180 1.79 0
127467 78340.0 -0.814682 1.319219 1.329415 0.027273 -0.284871 -0.653985 0.321552 0.435975 -0.704298 ... -0.128619 -0.368565 0.090660 0.401147 -0.261034 0.080621 0.162427 0.059456 1.98 0
137900 82382.0 -0.318193 1.118618 0.969864 -0.127052 0.569563 -0.532484 0.706252 -0.064966 -0.463271 ... -0.305402 -0.774704 -0.123884 -0.495687 -0.018148 0.121679 0.249050 0.092516 0.89 0
21513 31717.0 -1.328271 1.018378 1.775426 -1.574193 -0.117696 -0.457733 0.681867 -0.031641 0.383872 ... -0.220815 -0.419013 -0.239197 0.009967 0.232829 0.814177 0.098797 -0.004273 15.98 0
134700 80923.0 1.276712 0.617120 -0.578014 0.879173 0.061706 -1.472002 0.373692 -0.287204 -0.084482 ... -0.160161 -0.430404 -0.076738 0.258708 0.552170 0.370701 -0.034255 0.041709 0.76 0

5 rows × 31 columns

Out[6]:
Data Type Number of NaN Values Size Percentage
Amount float64 0 28481 0.0
Class int64 0 28481 0.0
Time float64 0 28481 0.0
V01 float64 0 28481 0.0
V02 float64 0 28481 0.0
V03 float64 0 28481 0.0
V04 float64 0 28481 0.0
V05 float64 0 28481 0.0
V06 float64 0 28481 0.0
V07 float64 0 28481 0.0
V08 float64 0 28481 0.0
V09 float64 0 28481 0.0
V10 float64 0 28481 0.0
V11 float64 0 28481 0.0
V12 float64 0 28481 0.0
V13 float64 0 28481 0.0
V14 float64 0 28481 0.0
V15 float64 0 28481 0.0
V16 float64 0 28481 0.0
V17 float64 0 28481 0.0
V18 float64 0 28481 0.0
V19 float64 0 28481 0.0
V20 float64 0 28481 0.0
V21 float64 0 28481 0.0
V22 float64 0 28481 0.0
V23 float64 0 28481 0.0
V24 float64 0 28481 0.0
V25 float64 0 28481 0.0
V26 float64 0 28481 0.0
V27 float64 0 28481 0.0
V28 float64 0 28481 0.0

Data Correlations

First off, let's define $X$ and $y$ sets.

In [7]:
Target = 'Class'
X = df.drop(columns = [Target])
y = df[Target]

Now, let's take a look at the variance of the features.

In [8]:
display(X.var().sort_values(ascending = False).to_frame(name= 'Variance').T.style.set_precision(2))
Time Amount V01 V02 V03 V04 V05 V06 V07 V08 V09 V10 V11 V13 V12 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28
Variance 2264306246.92 73383.90 3.98 2.92 2.32 2.02 1.95 1.78 1.53 1.45 1.21 1.16 1.04 1.00 1.00 0.92 0.83 0.77 0.73 0.71 0.67 0.65 0.55 0.53 0.42 0.36 0.27 0.24 0.17 0.10

As can see some of the variables have high variance and this is not desirable for our modeling. Thus, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().

In [9]:
X = StandardScaler().fit_transform(X)

Correlations of features with Class.

In [10]:
Temp = pd.DataFrame(X, columns = df.drop(columns = [Target]).columns)
Temp[Target] = y

def Correlation_Plot (Df,Fig_Size):
    Correlation_Matrix = Df.corr().round(2)
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask)] = True
    for i in range(len(mask)):
        mask[i,i]=0
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True, 
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .6})

Correlation_Plot (Temp, 16)

Train and Test Sets

In [11]:
y = pd.get_dummies(df[Target]).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Out[11]:
Set X_train X_test y_train y_test
Shape (19936, 30) (8545, 30) (19936, 2) (8545, 2)

Keras Sequential model

Here, we implement an artificial neural network (ANN) using Keras Sequential model.

In [12]:
model = Sequential()
model.add(Dense(16, input_dim= X.shape[1], init='uniform', activation='sigmoid', name='Layer1'))
model.add(Dense(8, init='uniform', activation='sigmoid', name='Layer2'))
model.add(Dense(y.shape[1], init='uniform', activation='sigmoid', name='Layer3'))
# Number of iterations
IT = 121

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy','mae', 'mse'])

# Train model
history = model.fit(X_train, y_train, nb_epoch= IT, batch_size=10,  verbose=0)
# Predications and Score
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test) 
8545/8545 [==============================] - 0s 12us/step

Accuracy

In [13]:
score = pd.DataFrame(score, index = model.metrics_names).T
history = pd.DataFrame(history.history)
display(score.style.hide_index())
loss accuracy mae mse
0.007681 0.999181 0.499290 0.498883
In [14]:
fig = go.Figure()
fig.add_trace(go.Scatter(x= history.index.values, y= history['loss'].values, line=dict(color='OrangeRed', width= 1.5), 
                         name = 'Loss'))
fig.add_trace(go.Scatter(x= history.index.values, y= history['accuracy'].values, line=dict(color='MidnightBlue', width= 1.5), 
                         name = 'Accuracy'))
fig.add_trace(go.Scatter(x= history.index.values, y= history['mae'].values, line=dict(color='ForestGreen', width= 1.5), 
                         name = 'Mean Absolute Error (MAE)'))
fig.add_trace(go.Scatter(x= history.index.values, y= history['mse'].values, line=dict(color='purple', width= 1.5), 
                         name = 'Mean Squared Error (MSE)'))
fig.update_layout(legend=dict(y=0.5, traceorder='reversed', font_size=12))
fig.update_layout(dragmode='select', plot_bgcolor= 'white', height=600, hovermode='closest')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig['layout']['xaxis'].update(range=[0, history.index.values.max()])
# fig['layout']['yaxis'].update(range=[0, 1.6])
fig.show()

A Graph of the Model

In [15]:
plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True)
Out[15]:

Using the following code gives our model diagram

from ann_visualizer.visualize import ann_viz
from pdf2image import convert_from_path
ann_viz(model, filename = 'Model01',title="The Model");
for Img in convert_from_path('Model01.pdf'):
    Img.save('Model01.jpg', 'JPEG')

Next, we can plot confusion matrix for our classifier.

In [16]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
Confusion_Matrix = confusion_matrix(y_test.idxmax(axis=1),
                                    pd.DataFrame(np.round(y_pred),columns = y_test.columns).astype(int).idxmax(axis=1))

_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0])
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

_ = sns.heatmap(Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis],\
                annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[1])
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)